The purpose of this file is to practice applying basic R commands learned through lecture videos and perform exploratory analysis on worldwide COVID-19 data.

library(dplyr)
library(lubridate)
library(knitr)
library(ggplot2)
library(cowplot)

Our World in Data GitHub COVID-19 outcomes

Data was read in and summarized as shown below: (current file is as of 5/31)

owid <- read.csv(url("https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv"), na.strings = c("", "NA"), colClasses = c("date" = "Date"), header = TRUE)
dim(owid)
## [1] 92885    59
head(owid)
str(owid)
summary(owid)

Full Cleaned Data Set

Data set, including possible confounders for use in later analysis, is cleaned and filtered below.

owid.filtered <- owid %>%
                  rename(iso.code = iso_code, totcases = total_cases, totdeaths = total_deaths, totcases.mil = total_cases_per_million, totdeaths.mil = total_deaths_per_million, hosp.patients = hosp_patients, hosp.patients.mil = hosp_patients_per_million, pop.density = population_density, med.age = median_age, older70 = aged_70_older, gdp.cap = gdp_per_capita, card.death = cardiovasc_death_rate, diab.prev = diabetes_prevalence, fem.smokers = female_smokers, male.smokers = male_smokers) %>%
                  select(c(iso.code:totcases, totdeaths, totcases.mil, totdeaths.mil, hosp.patients, hosp.patients.mil, population, pop.density, med.age, older70, gdp.cap, card.death, diab.prev, fem.smokers, male.smokers)) %>% #Renamed variables and filtered to keep only those of interest. 
                  filter(date == "2020-05-31") #Filtered out all but most recent date (most recent date is a running sum of total cases).
table(owid.filtered$continent, useNA = "ifany") #Data includes both continent and country-specific outcomes. Continent outcomes can be distinguished by "NA" in the continent column; the continent name is in the location column. 
## 
##        Africa          Asia        Europe North America       Oceania 
##            54            47            46            23             4 
## South America          <NA> 
##            12             9

More exploratory analysis of data frames:

dim(owid.filtered)
head(owid.filtered)
tail(owid.filtered)

Continent Analysis

Data is filtered to remove entries for specific country locations and include only data that aggregates COVID-19 outcomes on a continent and world scale.

owid.continents <- owid.filtered %>%
                    filter(is.na(continent)) %>%
                    select(-continent) #Used missingness properties to select only continent rows and then removed continent column. 
table(owid.continents$location)
## 
##         Africa           Asia         Europe European Union  International 
##              1              1              1              1              1 
##  North America        Oceania  South America          World 
##              1              1              1              1

Country Analysis

Data is filtered to include only country-specific COVID-19 outcomes and exclude continent and world-wide data.

owid.countries <- owid.filtered %>%
                    filter(!is.na(continent)) #Removed all rows with NA for continent, leaving behind only countries. 
table(owid.countries$continent)
## 
##        Africa          Asia        Europe North America       Oceania 
##            54            47            46            23             4 
## South America 
##            12

Tests for Missingness

table(owid.filtered$totcases, useNA = "ifany") %>%
  tail() #No missing values for total cases column. 
## 
## 1138778 1160536 1798793 1959812 2031973 6188259 
##       1       1       1       1       1       1
table(owid.filtered$totdeaths, useNA = "ifany") %>%
  tail() #Several missing values for total deaths column. Keep this in mind when analyzing. 
## 
## 107859 126360 127102 172683 375061   <NA> 
##      1      1      1      1      1     19
table(owid.filtered$hosp.patients, useNA = "ifany") %>%
  tail() #Most values are missing. Likely not useful for data analysis. 
## 
##  2116  2134  6822  7237 14271  <NA> 
##     1     1     1     1     1   169
which(is.na(owid.filtered$totdeaths))
##  [1]  22  31  51  57  63  72  98 101 118 123 138 148 149 150 156 177 182 189 191
#Missing countries: Butan, Cambodia, Dominica, Eritrea, Fiji, Grenada, Laos, Lesotho, Mongolia, Namibia, Saint Kitts and Nevis, Saint Lucia, Saint Vincent and the Grenadines, Seychelles, Timor, Uganda, Vatican, Vietnam.

Modified Data Frames

Filtered OWID data frames further narrowed to COVID-19 cases only, deaths only, and hospitalizations only, separated by continent and by country respectively.

#By continent:
select(owid.continents, c(iso.code:totcases, totcases.mil)) #COVID-19 cases by continent. 
select(owid.continents, c(iso.code:date, totdeaths, totdeaths.mil)) #COVID-19 deaths by continent. 
select(owid.continents, c(iso.code:date, hosp.patients, hosp.patients.mil)) #COVID-19 hospitalizations by continent. 

#By country:
select(owid.countries, c(iso.code:totcases, totcases.mil)) #COVID-19 cases by country. 
select(owid.countries, c(iso.code:date, totdeaths, totdeaths.mil)) #COVID-19 deaths by country. 
select(owid.countries, c(iso.code:date, hosp.patients, hosp.patients.mil)) #COVID-19 hospitalizations by country.  

Case Count Plots

Continent COVID-19 Plots

Plots for exploratory analysis comparing COVID-19 cases by continent for total cases, total cases per million people, total deaths, and total deaths per million people.

continents1 <- owid.continents %>%
                  filter(!(location %in% c("International", "World"))) %>%
                  ggplot(aes(x = location, y = totcases)) +
                  geom_bar(stat="identity") +
                  geom_text(aes(label = totcases), vjust = -0.25, size = 3) + 
                  labs(title = "Total COVID-19 Cases by Continent") +
                  labs(x = "Continent", y = "Total COVID-19 Cases") +
                  theme(axis.text.x = element_text(angle = 45, hjust = 1)) #Barplot of total cases by continent. 

continents2 <- owid.continents %>%
                  filter(!(location %in% c("International", "World"))) %>%
                  ggplot(aes(x = location, y = totcases.mil)) +
                  geom_bar(stat="identity") + 
                  geom_text(aes(label = totcases), vjust = -0.25, size = 3) + 
                  labs(title = "Total COVID-19 Cases per Million People by Continent") +
                  labs(x = "Continent", y = "Total COVID-19 Cases per Million") +
                  theme(axis.text.x = element_text(angle = 45, hjust = 1)) #Barplot of total cases per million by continent.

plot_grid(continents1, continents2, labels = "AUTO")

continents3 <- owid.continents %>%
                    filter(!(location %in% c("International", "World"))) %>%
                    ggplot(aes(x = location, y = totdeaths)) +
                    geom_bar(stat="identity") +
                    geom_text(aes(label = totcases), vjust = -0.25, size = 3) + 
                    labs(title = "Total COVID-19 Deaths by Continent") +
                    labs(x = "Continent", y = "Total COVID-19 Cases") +
                    theme(axis.text.x = element_text(angle = 45, hjust = 1)) #Barplot of total deaths by continent. 

continents4 <- owid.continents %>%
                    filter(!(location %in% c("International", "World"))) %>%
                    ggplot(aes(x = location, y = totdeaths.mil)) +
                    geom_bar(stat="identity") +
                    geom_text(aes(label = totcases), vjust = -0.25, size = 3) + 
                    labs(title = "Total COVID-19 Death per Million by Continent") +
                    labs(x = "Continent", y = "Total COVID-19 Cases") +
                    theme(axis.text.x = element_text(angle = 45, hjust = 1)) #Barplot of total deaths per million by continent.

plot_grid(continents3, continents4, labels = "AUTO")

Top 10 Country COVID-19 Plots

Barplots filtering out the 10 countries with the highest COVID-19 cases per million people on each continent, as well as a heatmap of all countries, are the outputs of the code below:

ctry.mean.cases <- owid.countries %>%
    group_by(continent) %>%
    summarise(totcases.mil = mean(totcases.mil, na.rm = TRUE)) #Mean COVID-19 cases per country, grouped by continent.
mean.cases <- function(x) {
                      ctry.mean.cases %>%
                      filter(continent == x) %>%
                      select(totcases.mil) %>%
                      as.numeric()
} #Function to produce numeric value of mean COVID-19 cases per country by continent to be used in bar graphs below. 
top.ctry.cases <- function(x) { 
                    owid.countries %>% 
                    filter(continent == x) %>%
                    arrange(desc(totcases.mil)) %>%
                    head(n=10) %>%
                    ggplot(aes(x = location, y = totcases.mil)) +
                        geom_bar(stat = "identity") +
                        geom_abline(slope=0, intercept= mean.cases(x),  col = "red", lty=2) +
                        labs(x = "Country", y = "Total COVID-19 Cases per Million People") +
                        theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
                        labs(title = paste0("COVID-19 Cases per Million People in ", (continent = x)))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
  print(top.ctry.cases(i))
} #Loops function for continents of interest. 

ctry.mean.deaths <- owid.countries %>%
    group_by(continent) %>%
    summarise(totdeaths.mil = mean(totdeaths.mil, na.rm = TRUE)) #Mean COVID-19 daeths per country, grouped by continent.
mean.deaths <- function(x) {
                      ctry.mean.deaths %>%
                      filter(continent == x) %>%
                      select(totdeaths.mil) %>%
                      as.numeric()
} #Function to produce numeric value of mean COVID-19 deaths per country by continent to be used in bar graphs below. 
top.ctry.deaths <- function(x) { 
                    owid.countries %>% 
                    filter(continent == x) %>%
                    arrange(desc(totdeaths.mil)) %>%
                    head(n=10) %>%
                    ggplot(aes(x = location, y = totdeaths.mil)) +
                        geom_bar(stat = "identity") +
                        geom_abline(slope=0, intercept= mean.deaths(x),  col = "red", lty=2) +
                        labs(x = "Country", y = "Total COVID-19 Deaths per Million People") +
                        theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
                        labs(title = paste0("COVID-19 Deaths per Million People in ", (continent = x)))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
  print(top.ctry.deaths(i))
} #Loops function for continents of interest. 

Heatmaps of COVID-19 Cases Over Time, All Countries

owid %>%
  rename(iso.code = iso_code, totcases.mil = total_cases_per_million) %>%
  select(c(iso.code:date, totcases.mil)) %>%
  filter(!is.na(continent)) %>%
  ggplot(aes(x = date, y = location, fill = totcases.mil)) +
    geom_tile() +
    labs(title = "Total COVID-19 Cases per Million Over Time", size = 22) +
    scale_x_date(date_breaks = "1 month") +
    scale_fill_gradient(low = "#ffeda0", high = "#f03b20") +
    ggsave(file = "test.image.pdf", width = 20, height = 25) #Total cases per million over time by country as heatmap. 

owid %>%
  rename(iso.code = iso_code, new.cases = new_cases) %>%
  select(c(iso.code:date, new.cases)) %>%
  filter(!is.na(continent)) %>%
  ggplot(aes(x = date, y = location, fill = new.cases)) +
    geom_tile() +
    labs(title = "New COVID-19 Cases per Million Over Time", size = 22) +
    scale_x_date(date_breaks = "1 month") +
    scale_fill_gradient(low = "#ffeda0", high = "#f03b20", limits = c(0, 20000)) +
    ggsave(file = "test.image2.pdf", width = 20, height = 25) #New cases per million over time by country as heatmap. Difficult to scale because range is so large.  

Population Size and COVID-19 Outcomes

Population Size and COVID-19 Cases per Million

Population size and population density are evaluated below as confounding variables for COVID-19 cases. These variables are first analyzed on a full-world scale, and then by continent.

ggplot(data = owid.countries, aes(x = population, y = totcases.mil)) +
  geom_point(color = "#333aff") +
  geom_smooth(color = "red") +
  geom_smooth(method = "lm", color = "green") +
  theme_bw() +
  labs(x = "Population Size", y = "Total COVID-19 Cases per Million People") +
  labs(title = ("World COVID-19 Cases per Million People vs. Population Size")) #COVID-19 cases and population size on world scale. Red line is smooth fit; green line is linear fit. 

cor(owid.countries$totcases.mil, owid.countries$population, use = "complete.obs") 
## [1] -0.05455269
pop.lm.cases <- lm(totcases.mil ~ population, data = owid.countries)
pop.lm.cases
## 
## Call:
## lm(formula = totcases.mil ~ population, data = owid.countries)
## 
## Coefficients:
## (Intercept)   population  
##   1.378e+03   -9.934e-07
summary.lm(pop.lm.cases) #Summary statistics for linear fit. Not a significant relationship. 
## 
## Call:
## lm(formula = totcases.mil ~ population, data = owid.countries)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1375.3 -1273.2 -1085.4   170.6 18393.0 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.378e+03  2.092e+02   6.588 4.54e-10 ***
## population  -9.934e-07  1.340e-06  -0.741     0.46    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2750 on 184 degrees of freedom
## Multiple R-squared:  0.002976,   Adjusted R-squared:  -0.002443 
## F-statistic: 0.5492 on 1 and 184 DF,  p-value: 0.4596
ggplot(data = owid.countries, aes(x = pop.density, y = totcases.mil)) +
  geom_point(color = "#333aff") +
  geom_smooth(color = "red") +
  geom_smooth(method = "lm", color = "green") +
  theme_bw() +
  labs(x = "Population Density", y = "Total COVID-19 Cases per Million People") +
  labs(title = ("World COVID-19 Cases per Million People vs. Population Density")) #COVID-19 cases and population size on world scale. Red line is smooth fit; green line is linear fit. 

cor(owid.countries$totcases.mil, owid.countries$pop.density, use = "complete.obs") 
## [1] 0.09592666
pop.den.lm.cases <- lm(totcases.mil ~ pop.density, data = owid.countries)
pop.den.lm.cases
## 
## Call:
## lm(formula = totcases.mil ~ pop.density, data = owid.countries)
## 
## Coefficients:
## (Intercept)  pop.density  
##    1231.377        0.152
summary.lm(pop.den.lm.cases) #Summary statistics for linear fit. Not a significant relationship. 
## 
## Call:
## lm(formula = totcases.mil ~ pop.density, data = owid.countries)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2157.1 -1177.8  -994.2   130.6 18487.2 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1231.3772   195.4280   6.301 2.21e-09 ***
## pop.density    0.1520     0.1176   1.293    0.198    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2578 on 180 degrees of freedom
##   (4 observations deleted due to missingness)
## Multiple R-squared:  0.009202,   Adjusted R-squared:  0.003697 
## F-statistic: 1.672 on 1 and 180 DF,  p-value: 0.1977
cases.pop <- function(x) { 
                    owid.countries %>% 
                    filter(continent == x) %>%
                    ggplot(aes(x = population, y = totcases.mil)) +
                        geom_point(color = "#333aff") +
                        geom_smooth(color = "red") +
                        geom_smooth(method = "lm", color = "green") +
                        theme_bw() +
                        labs(x = "Population Size", y = "Total COVID-19 Cases per Million People") +
                        labs(title = paste0("COVID-19 Cases per Million People in ", (continent = x), " vs. Population Size"))
} 
pop.lm.cases.cont <- function(x) {
                        owid.countries %>%
                        filter(continent == x) %>%
                        lm(totcases.mil ~ population,.) %>%
                        summary.lm() #Function for linear regression statistics to be included in for loop below. 
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
   print(cases.pop(i)) 
   print(pop.lm.cases.cont(i)) 
} #Loops function for continents of interest. 

## 
## Call:
## lm(formula = totcases.mil ~ population, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1497.85  -345.09  -175.71   -90.24  2662.88 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.970e+02  1.820e+02   2.182   0.0406 *  
## population  1.399e-05  2.434e-06   5.749 1.05e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 820.1 on 21 degrees of freedom
## Multiple R-squared:  0.6115, Adjusted R-squared:  0.593 
## F-statistic: 33.05 on 1 and 21 DF,  p-value: 1.047e-05

## 
## Call:
## lm(formula = totcases.mil ~ population, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2695.4 -2216.3 -1191.6   828.5 16764.2 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.007e+03  6.407e+02   4.694 2.63e-05 ***
## population  -6.117e-06  1.972e-05  -0.310    0.758    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3758 on 44 degrees of freedom
## Multiple R-squared:  0.002183,   Adjusted R-squared:  -0.0205 
## F-statistic: 0.09624 on 1 and 44 DF,  p-value: 0.7579

## 
## Call:
## lm(formula = totcases.mil ~ population, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1496.4 -1408.7 -1201.6   507.7 18248.0 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  1.509e+03  4.997e+02   3.020  0.00415 **
## population  -1.384e-06  1.671e-06  -0.828  0.41190   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3236 on 45 degrees of freedom
## Multiple R-squared:  0.01502,    Adjusted R-squared:  -0.006873 
## F-statistic: 0.686 on 1 and 45 DF,  p-value: 0.4119

## 
## Call:
## lm(formula = totcases.mil ~ population, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -321.11 -232.55 -199.00  -14.54 3069.34 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.283e+02  9.379e+01   3.500 0.000964 ***
## population  -2.899e-06  2.168e-06  -1.337 0.187074    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 564.8 on 52 degrees of freedom
## Multiple R-squared:  0.03323,    Adjusted R-squared:  0.01464 
## F-statistic: 1.787 on 1 and 52 DF,  p-value: 0.1871

## 
## Call:
## lm(formula = totcases.mil ~ population, data = .)
## 
## Residuals:
##       1       2       3       4 
##   16.16  -67.22  196.03 -144.97 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.078e+01  1.313e+02   0.615    0.601
## population  7.275e-06  9.564e-06   0.761    0.526
## 
## Residual standard error: 179.2 on 2 degrees of freedom
## Multiple R-squared:  0.2243, Adjusted R-squared:  -0.1635 
## F-statistic: 0.5785 on 1 and 2 DF,  p-value: 0.5263

## 
## Call:
## lm(formula = totcases.mil ~ population, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1446.1 -1227.4 -1084.0    11.2  5023.1 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.319e+03  7.569e+02   1.742    0.112
## population  6.354e-06  1.143e-05   0.556    0.591
## 
## Residual standard error: 2204 on 10 degrees of freedom
## Multiple R-squared:  0.02996,    Adjusted R-squared:  -0.06705 
## F-statistic: 0.3088 on 1 and 10 DF,  p-value: 0.5906
cases.pop.den <- function(x) { 
                    owid.countries %>% 
                    filter(continent == x) %>%
                    ggplot(aes(x = pop.density, y = totcases.mil)) +
                        geom_point(color = "#333aff") +
                        geom_smooth(color = "red") +
                        geom_smooth(method = "lm", color = "green") +
                        theme_bw() +
                        labs(x = "Population Density", y = "Total COVID-19 Cases per Million People") +
                        labs(title = paste0("COVID-19 Cases per Million People in ", (continent = x), " vs. Population Density"))
} 
pop.den.lm.cases.cont <- function(x) {
                        owid.countries %>%
                        filter(continent == x) %>%
                        lm(totcases.mil ~ pop.density,.) %>%
                        summary.lm() #Function for linear regression statistics to be included in for loop below. 
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
  print(cases.pop.den(i))
  print(pop.den.lm.cases.cont(i))
} #Loops function for continents of interest. 

## 
## Call:
## lm(formula = totcases.mil ~ pop.density, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1199.9  -668.7  -356.4    -7.5  4244.7 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 1292.670    405.934   3.184  0.00446 **
## pop.density   -2.893      1.689  -1.713  0.10149   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1232 on 21 degrees of freedom
## Multiple R-squared:  0.1226, Adjusted R-squared:  0.08078 
## F-statistic: 2.933 on 1 and 21 DF,  p-value: 0.1015

## 
## Call:
## lm(formula = totcases.mil ~ pop.density, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2356.3 -2006.6  -957.2   929.5 17129.3 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.633e+03  5.072e+02   5.191  5.4e-06 ***
## pop.density 1.617e-02  1.751e-01   0.092    0.927    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3332 on 43 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.0001984,  Adjusted R-squared:  -0.02305 
## F-statistic: 0.008531 on 1 and 43 DF,  p-value: 0.9268

## 
## Call:
## lm(formula = totcases.mil ~ pop.density, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3520.3 -1236.9 -1147.3   556.9 18445.3 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 1229.2082   523.4504   2.348   0.0235 *
## pop.density    0.3460     0.3177   1.089   0.2822  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3277 on 43 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.02684,    Adjusted R-squared:  0.004206 
## F-statistic: 1.186 on 1 and 43 DF,  p-value: 0.2822

## 
## Call:
## lm(formula = totcases.mil ~ pop.density, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -266.99 -236.07 -181.56  -44.54 3127.32 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 272.6243   102.1690   2.668   0.0102 *
## pop.density  -0.1263     0.6192  -0.204   0.8392  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 579.3 on 51 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.000815,   Adjusted R-squared:  -0.01878 
## F-statistic: 0.0416 on 1 and 51 DF,  p-value: 0.8392

## 
## Call:
## lm(formula = totcases.mil ~ pop.density, data = .)
## 
## Residuals:
##       1       2       3       4 
##   26.13   12.58  136.11 -174.81 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  273.491    130.823   2.091    0.172
## pop.density   -5.367      4.677  -1.148    0.370
## 
## Residual standard error: 158 on 2 degrees of freedom
## Multiple R-squared:  0.397,  Adjusted R-squared:  0.09555 
## F-statistic: 1.317 on 1 and 2 DF,  p-value: 0.3699

## 
## Call:
## lm(formula = totcases.mil ~ pop.density, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1791.1 -1201.3  -903.1   -36.4  4919.4 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   934.07    1096.78   0.852    0.414
## pop.density    25.10      36.77   0.683    0.510
## 
## Residual standard error: 2187 on 10 degrees of freedom
## Multiple R-squared:  0.04454,    Adjusted R-squared:  -0.051 
## F-statistic: 0.4662 on 1 and 10 DF,  p-value: 0.5103

Population Size and COVID-19 Deaths per Million

Population size and population density are evaluated below as confounding variables for COVID-19 deaths. These variables are first analyzed on a full-world scale, and then by continent.

ggplot(data = owid.countries, aes(x = population, y = totdeaths.mil)) +
  geom_point(color = "#333aff") +
  geom_smooth(color = "red") +
  geom_smooth(method = "lm", color = "green") +
  theme_bw() +
  labs(x = "Population Size", y = "Total COVID-19 Deaths per Million People") +
  labs(title = ("World COVID-19 Deaths per Million People in vs. Population Size")) #COVID-19 deaths and population size on world scale. Red line is smooth fit; green line is linear fit. 

cor(owid.countries$totdeaths.mil, owid.countries$population, use = "complete.obs") 
## [1] -0.01846469
pop.lm.deaths <- lm(totdeaths.mil ~ population, data = owid.countries)
pop.lm.deaths
## 
## Call:
## lm(formula = totdeaths.mil ~ population, data = owid.countries)
## 
## Coefficients:
## (Intercept)   population  
##   6.043e+01   -1.815e-08
summary.lm(pop.lm.deaths) #Summary statistics for linear fit. Not a significant relationship. 
## 
## Call:
## lm(formula = totdeaths.mil ~ population, data = owid.countries)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##  -60.13  -57.43  -51.85  -26.37 1177.12 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.043e+01  1.259e+01   4.800 3.53e-06 ***
## population  -1.815e-08  7.652e-08  -0.237    0.813    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 156.4 on 165 degrees of freedom
##   (19 observations deleted due to missingness)
## Multiple R-squared:  0.0003409,  Adjusted R-squared:  -0.005718 
## F-statistic: 0.05628 on 1 and 165 DF,  p-value: 0.8128
ggplot(data = owid.countries, aes(x = pop.density, y = totdeaths.mil)) +
  geom_point(color = "#333aff") +
  geom_smooth(color = "red") +
  geom_smooth(method = "lm", color = "green") +
  theme_bw() +
  labs(x = "Population Density", y = "Total COVID-19 Deaths per Million People") +
  labs(title = ("World COVID-19 Deaths per Million People in vs. Population Density")) #COVID-19 deaths and population size on world scale. Red line is smooth fit; green line is linear fit. 

cor(owid.countries$totdeaths.mil, owid.countries$pop.density, use = "complete.obs") 
## [1] 0.01069476
pop.den.lm.deaths <- lm(totdeaths.mil ~ pop.density, data = owid.countries)
pop.den.lm.deaths
## 
## Call:
## lm(formula = totdeaths.mil ~ pop.density, data = owid.countries)
## 
## Coefficients:
## (Intercept)  pop.density  
##   6.033e+01    9.799e-04
summary.lm(pop.den.lm.deaths) #Summary statistics for linear fit. Not a significant relationship. 
## 
## Call:
## lm(formula = totdeaths.mil ~ pop.density, data = owid.countries)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##  -66.69  -57.74  -53.14  -26.85 1176.68 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 6.033e+01  1.260e+01   4.789 3.76e-06 ***
## pop.density 9.799e-04  7.199e-03   0.136    0.892    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 157.6 on 162 degrees of freedom
##   (22 observations deleted due to missingness)
## Multiple R-squared:  0.0001144,  Adjusted R-squared:  -0.006058 
## F-statistic: 0.01853 on 1 and 162 DF,  p-value: 0.8919
deaths.pop <- function(x) { 
                    owid.countries %>% 
                    filter(continent == x) %>%
                    ggplot(aes(x = population, y = totdeaths.mil)) +
                        geom_point(color = "#333aff") +
                        geom_smooth(color = "red") +
                        theme_bw() +
                        labs(x = "Population Size", y = "Total COVID-19 Deaths per Million People") +
                        labs(title = paste0("COVID-19 Cases per Million People in ", (continent = x), " vs. Population Size"))
} 
pop.lm.deaths.cont <- function(x) {
                        owid.countries %>%
                        filter(continent == x) %>%
                        lm(totdeaths.mil ~ population,.) %>%
                        summary.lm() #Function for linear regression statistics to be included in for loop below. 
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
  print(deaths.pop(i))
  print(pop.lm.deaths.cont(i))
} #Loops function for continents of interest. 

## 
## Call:
## lm(formula = totdeaths.mil ~ population, data = .)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -58.65 -22.03 -15.34   8.50 155.68 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.990e+01  1.203e+01   1.654    0.118    
## population  8.978e-07  1.424e-07   6.306 1.04e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 47.09 on 16 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.7131, Adjusted R-squared:  0.6952 
## F-statistic: 39.77 on 1 and 16 DF,  p-value: 1.045e-05

## 
## Call:
## lm(formula = totdeaths.mil ~ population, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -295.08 -139.13 -109.91   22.27 1084.80 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 1.527e+02  4.546e+01   3.359  0.00165 **
## population  1.196e-06  1.384e-06   0.864  0.39225   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 262.7 on 43 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.01707,    Adjusted R-squared:  -0.005785 
## F-statistic: 0.7469 on 1 and 43 DF,  p-value: 0.3923

## 
## Call:
## lm(formula = totdeaths.mil ~ population, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -11.333  -9.482  -6.405  -1.058  81.553 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.171e+01  3.120e+00   3.753 0.000569 ***
## population  -5.163e-09  9.759e-09  -0.529 0.599776    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.77 on 39 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.007125,   Adjusted R-squared:  -0.01833 
## F-statistic: 0.2799 on 1 and 39 DF,  p-value: 0.5998

## 
## Call:
## lm(formula = totdeaths.mil ~ population, data = .)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -5.050 -4.417 -2.140  1.130 49.289 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.471e+00  1.514e+00   3.613 0.000734 ***
## population  -2.837e-08  3.370e-08  -0.842 0.404077    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.608 on 47 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.01486,    Adjusted R-squared:  -0.006102 
## F-statistic: 0.7089 on 1 and 47 DF,  p-value: 0.4041

## 
## Call:
## lm(formula = totdeaths.mil ~ population, data = .)
## 
## Residuals:
## ALL 2 residuals are 0: no residual degrees of freedom!
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)  4.684e+00        NaN     NaN      NaN
## population  -2.529e-08        NaN     NaN      NaN
## 
## Residual standard error: NaN on 0 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:      1,  Adjusted R-squared:    NaN 
## F-statistic:   NaN on 1 and 0 DF,  p-value: NA

## 
## Call:
## lm(formula = totdeaths.mil ~ population, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -46.024 -35.593 -22.643   3.199 149.180 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.238e+01  2.145e+01   1.509    0.162
## population  4.972e-07  3.240e-07   1.534    0.156
## 
## Residual standard error: 62.45 on 10 degrees of freedom
## Multiple R-squared:  0.1906, Adjusted R-squared:  0.1096 
## F-statistic: 2.354 on 1 and 10 DF,  p-value: 0.1559
deaths.pop.den <- function(x) { 
                    owid.countries %>% 
                    filter(continent == x) %>%
                    ggplot(aes(x = pop.density, y = totdeaths.mil)) +
                        geom_point(color = "#333aff") +
                        geom_smooth(color = "red") +
                        geom_smooth(method = "lm", color = "green") +
                        theme_bw() +
                        labs(x = "Population Density", y = "Total COVID-19 Deaths per Million People") +
                        labs(title = paste0("COVID-19 Deaths per Million People in ", (continent = x), " vs. Population Density"))
} 
pop.den.lm.cases.cont <- function(x) {
                        owid.countries %>%
                        filter(continent == x) %>%
                        lm(totdeaths.mil ~ pop.density,.) %>%
                        summary.lm() #Function for linear regression statistics to be included in for loop below. 
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
  print(deaths.pop.den(i))
  print(pop.den.lm.cases.cont(i))
} #Loops function for continents of interest. 

## 
## Call:
## lm(formula = totdeaths.mil ~ pop.density, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -71.354 -45.176 -22.267   7.791 252.849 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  79.2768    28.0887   2.822   0.0123 *
## pop.density  -0.1761     0.1187  -1.483   0.1574  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 82.43 on 16 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.1209, Adjusted R-squared:  0.06593 
## F-statistic:   2.2 on 1 and 16 DF,  p-value: 0.1574

## 
## Call:
## lm(formula = totdeaths.mil ~ pop.density, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -168.39 -148.77 -121.59    2.42 1064.84 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 173.722636  40.334921   4.307 9.42e-05 ***
## pop.density  -0.001816   0.013921  -0.130    0.897    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 265 on 43 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.0003954,  Adjusted R-squared:  -0.02285 
## F-statistic: 0.01701 on 1 and 43 DF,  p-value: 0.8968

## 
## Call:
## lm(formula = totdeaths.mil ~ pop.density, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -12.516  -9.817  -6.638  -0.171  80.153 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 12.754973   3.283937   3.884  0.00041 ***
## pop.density -0.001575   0.001856  -0.848  0.40173    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.99 on 37 degrees of freedom
##   (8 observations deleted due to missingness)
## Multiple R-squared:  0.01908,    Adjusted R-squared:  -0.007434 
## F-statistic: 0.7196 on 1 and 37 DF,  p-value: 0.4017

## 
## Call:
## lm(formula = totdeaths.mil ~ pop.density, data = .)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -5.894 -4.129 -2.571  0.850 49.621 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 4.501368   1.599068   2.815  0.00716 **
## pop.density 0.002969   0.009547   0.311  0.75723   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.738 on 46 degrees of freedom
##   (6 observations deleted due to missingness)
## Multiple R-squared:  0.002098,   Adjusted R-squared:  -0.0196 
## F-statistic: 0.0967 on 1 and 46 DF,  p-value: 0.7572

## 
## Call:
## lm(formula = totdeaths.mil ~ pop.density, data = .)
## 
## Residuals:
## ALL 2 residuals are 0: no residual degrees of freedom!
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  3.92739        NaN     NaN      NaN
## pop.density  0.03486        NaN     NaN      NaN
## 
## Residual standard error: NaN on 0 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:      1,  Adjusted R-squared:    NaN 
## F-statistic:   NaN on 1 and 0 DF,  p-value: NA

## 
## Call:
## lm(formula = totdeaths.mil ~ pop.density, data = .)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -75.83 -32.95   1.20  19.14  86.27 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  -3.4806    27.9420  -0.125   0.9033  
## pop.density   2.2013     0.9367   2.350   0.0406 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 55.72 on 10 degrees of freedom
## Multiple R-squared:  0.3558, Adjusted R-squared:  0.2913 
## F-statistic: 5.522 on 1 and 10 DF,  p-value: 0.04064

GDP and COVID-19 Outcomes

GDP and COVID-19 Cases per Million

First, a linear regression is performed on world COVID-19 cases and GDP. Then, scatter plots with a smooth line showing the relationship of GDP per capita and COVID-19 cases per million people are the outputs of the function below:

ggplot(data = owid.countries, aes(x = gdp.cap, y = totcases.mil)) +
  geom_point(color = "#333aff") +
  geom_smooth(color = "red") +
  geom_smooth(method = "lm", color = "green") +
  theme_bw() +
  labs(x = "GDP per Capita", y = "Total COVID-19 Cases per Million People") +
  labs(title = paste0("World COVID-19 Cases per Million People in vs. GDP per Capita")) #COVID-19 cases and GDP per capita on world scale. Red line is smooth fit; green line is linear fit. 

cor(owid.countries$totcases.mil, owid.countries$gdp.cap, use = "complete.obs") 
## [1] 0.6567062
gdp.lm.cases <- lm(totcases.mil ~ gdp.cap, data = owid.countries)
gdp.lm.cases
## 
## Call:
## lm(formula = totcases.mil ~ gdp.cap, data = owid.countries)
## 
## Coefficients:
## (Intercept)      gdp.cap  
##  -358.20891      0.08419
summary.lm(gdp.lm.cases) #Summary statistics for linear fit. 
## 
## Call:
## lm(formula = totcases.mil ~ gdp.cap, data = owid.countries)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5365.0  -832.3   -60.2   312.2 15342.5 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.582e+02  1.985e+02  -1.804   0.0729 .  
## gdp.cap      8.419e-02  7.288e-03  11.552   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1911 on 176 degrees of freedom
##   (8 observations deleted due to missingness)
## Multiple R-squared:  0.4313, Adjusted R-squared:  0.428 
## F-statistic: 133.5 on 1 and 176 DF,  p-value: < 2.2e-16
cases.gdp <- function(x) { 
                    owid.countries %>% 
                    filter(continent == x) %>%
                    ggplot(aes(x = gdp.cap, y = totcases.mil)) +
                        geom_point(color = "#333aff") +
                        geom_smooth(color = "red") +
                        geom_smooth(method = "lm", color = "green") +
                        theme_bw() +
                        labs(x = "GDP per Capita", y = "Total COVID-19 Cases per Million People") +
                        labs(title = paste0("COVID-19 Cases per Million People in ", (continent = x), " vs. GDP per Capita"))
} 
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
  print(cases.gdp(i))
} #Loops function for continents of interest. 

GDP and COVID-19 Deaths per Million

First, a linear regression is performed on world COVID-19 deaths and GDP. Scatter plots with a smooth line showing the relationship of GDP per capita and COVID-19 deaths per million people are the outputs of the function below:

ggplot(data = owid.countries, aes(x = gdp.cap, y = totdeaths.mil)) +
  geom_point(color = "#333aff") +
  geom_smooth(color = "red") +
  geom_smooth(method = "lm", color = "green") +
  theme_bw() +
  labs(x = "GDP per Capita", y = "Total COVID-19 Deaths per Million People") +
  labs(title = paste0("World COVID-19 Deaths per Million People in vs. GDP per Capita")) #COVID-19 deaths and GDP per capita on world scale. Red line is smooth fit; green line is linear fit. 

cor(owid.countries$totdeaths.mil, owid.countries$gdp.cap, use = "complete.obs") 
## [1] 0.3780182
gdp.lm.deaths <- lm(totdeaths.mil ~ gdp.cap, data = owid.countries)
gdp.lm.deaths
## 
## Call:
## lm(formula = totdeaths.mil ~ gdp.cap, data = owid.countries)
## 
## Coefficients:
## (Intercept)      gdp.cap  
##    1.167475     0.002812
summary.lm(gdp.lm.deaths) #Summary statistics for linear fit. 
## 
## Call:
## lm(formula = totdeaths.mil ~ gdp.cap, data = owid.countries)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -316.82  -41.37  -15.05   -2.98 1076.48 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.167e+00  1.560e+01   0.075     0.94    
## gdp.cap     2.812e-03  5.479e-04   5.132  8.3e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 140.9 on 158 degrees of freedom
##   (26 observations deleted due to missingness)
## Multiple R-squared:  0.1429, Adjusted R-squared:  0.1375 
## F-statistic: 26.34 on 1 and 158 DF,  p-value: 8.298e-07
deaths.gdp <- function(x) { 
                    owid.countries %>% 
                    filter(continent == x) %>%
                    ggplot(aes(x = gdp.cap, y = totdeaths.mil)) +
                        geom_point(color = "#333aff") +
                        geom_smooth(color = "red") +
                        geom_smooth(method = "lm", color = "green") +
                        theme_bw() +
                        labs(x = "GDP per Capita", y = "Total COVID-19 Deaths per Million People") +
                        labs(title = paste0("COVID-19 Deaths per Million People in ", (continent = x), " vs. GDP per Capita"))
} 
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
  print(deaths.gdp(i))
} #Loops function for continents of interest. 

Median Age and COVID-19 Outcomes

Median Age and COVID-19 Cases per Million

First, scatter plot shows all countries and the relationship between median age and COVID-19 outcomes. Next, median age is cut in two halves and converted to factors with labels “below.med” and “above.med”. A bar plot shows COVID-19 cases per million for each country separated by continent, with fill color indicating whether the median population age is above or below the global median.

ggplot(data = owid.countries, aes(x = med.age, y = totcases.mil)) +
  geom_point(color = "#333aff", alpha = 1) +
  geom_smooth(color = "red") +
  geom_smooth(method = "lm", color = "green") +
  theme_bw() +
  labs(x = "Median Age", y = "Total COVID-19 Cases per Million People") +
  labs(title = "World COVID-19 Cases per Million People vs. Median Population Age") #COVID-19 cases vs. median age on world scale. 

cor(owid.countries$totcases.mil, owid.countries$med.age, use = "complete.obs")
## [1] 0.326925
age.lm.cases <- lm(totcases.mil ~ med.age, data = owid.countries)
age.lm.cases
## 
## Call:
## lm(formula = totcases.mil ~ med.age, data = owid.countries)
## 
## Coefficients:
## (Intercept)      med.age  
##    -1161.02        74.91
summary.lm(age.lm.cases) #Summary statistics for linear fit. 
## 
## Call:
## lm(formula = totcases.mil ~ med.age, data = owid.countries)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2317.4  -954.8  -304.1    25.1 18524.4 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1161.02     518.02  -2.241   0.0263 *  
## med.age        74.91      16.32   4.589 8.43e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1998 on 176 degrees of freedom
##   (8 observations deleted due to missingness)
## Multiple R-squared:  0.1069, Adjusted R-squared:  0.1018 
## F-statistic: 21.06 on 1 and 176 DF,  p-value: 8.428e-06
exp.model.cases <- lm(log(totcases.mil) ~ med.age, data = owid.countries)
exp.model.cases
## 
## Call:
## lm(formula = log(totcases.mil) ~ med.age, data = owid.countries)
## 
## Coefficients:
## (Intercept)      med.age  
##      1.6876       0.1259
summary.lm(exp.model.cases)
## 
## Call:
## lm(formula = log(totcases.mil) ~ med.age, data = owid.countries)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6445 -1.0443  0.0571  1.1717  4.1880 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.68764    0.43260   3.901 0.000136 ***
## med.age      0.12587    0.01363   9.234  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.668 on 176 degrees of freedom
##   (8 observations deleted due to missingness)
## Multiple R-squared:  0.3263, Adjusted R-squared:  0.3225 
## F-statistic: 85.26 on 1 and 176 DF,  p-value: < 2.2e-16
summarise(owid.countries, med.age = mean(med.age, na.rm = TRUE)) #Mean of the median age of all countries.
##    med.age
## 1 30.37978
cases.med.age <- function(x) { 
                    owid.countries %>%
                    mutate(med.age = cut(med.age, breaks = c(0, 30.38, 48.3), labels = c("below.med", "above.med"))) %>%
                    mutate(med.age <- as.factor(med.age)) %>%
                    filter(continent == x) %>%
                    ggplot(aes(x = location, y = totcases.mil, fill = factor(med.age))) +
                        geom_bar(stat = "identity", position = "dodge") +
                        theme_bw() +
                        labs(x = "Country", y = "Total COVID-19 Cases per Million People") +
                        scale_fill_brewer(palette = "Set1") +
                        theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
                        labs(title = paste0("COVID-19 Cases per Million People by Country in ", (continent = x), " and Median Age"))
} 
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
  print(cases.med.age(i))
} #Loops function for continents of interest.

Median Age and COVID-19 Deaths per Million

First, scatter plot shows all countries and the relationship between median age and COVID-19 deaths. Next, median age is cut in two halves and converted to factors with labels “below.med” and “above.med”. A bar plot shows COVID-19 daeths per million for each country separated by continent, with fill color indicating whether the median population age is above or below the global median.

ggplot(data = owid.countries, aes(x = med.age, y = totdeaths.mil)) +
  geom_point(color = "#333aff", alpha = 1) +
  geom_smooth(color = "red") +
  geom_smooth(method = "lm", color = "green") +
  theme_bw() +
  labs(x = "Median Age", y = "Total COVID-19 Deaths per Million People") +
  labs(title = "World COVID-19 Deaths per Million People vs. Median Population Age") #COVID-19 deaths vs. median age on world scale. 

cor(owid.countries$totdeaths.mil, owid.countries$med.age, use = "complete.obs")
## [1] 0.3983249
age.lm.deaths <- lm(totdeaths.mil ~ med.age, data = owid.countries)
age.lm.deaths
## 
## Call:
## lm(formula = totdeaths.mil ~ med.age, data = owid.countries)
## 
## Coefficients:
## (Intercept)      med.age  
##     -106.27         5.04
summary.lm(age.lm.deaths) #Summary statistics for linear fit. 
## 
## Call:
## lm(formula = totdeaths.mil ~ med.age, data = owid.countries)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -129.54  -51.45  -15.42   13.50  712.47 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -106.2735    29.5129  -3.601 0.000423 ***
## med.age        5.0396     0.9175   5.493 1.53e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 109.1 on 160 degrees of freedom
##   (24 observations deleted due to missingness)
## Multiple R-squared:  0.1587, Adjusted R-squared:  0.1534 
## F-statistic: 30.17 on 1 and 160 DF,  p-value: 1.525e-07
exp.model.deaths <- lm(log(totdeaths.mil) ~ med.age, data = owid.countries)
exp.model.deaths
## 
## Call:
## lm(formula = log(totdeaths.mil) ~ med.age, data = owid.countries)
## 
## Coefficients:
## (Intercept)      med.age  
##     -2.1931       0.1377
summary.lm(exp.model.deaths)
## 
## Call:
## lm(formula = log(totdeaths.mil) ~ med.age, data = owid.countries)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.8420 -1.0420  0.1639  1.0674  3.6210 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.19314    0.44359  -4.944 1.92e-06 ***
## med.age      0.13770    0.01379   9.986  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.64 on 160 degrees of freedom
##   (24 observations deleted due to missingness)
## Multiple R-squared:  0.3839, Adjusted R-squared:  0.3801 
## F-statistic: 99.71 on 1 and 160 DF,  p-value: < 2.2e-16
summarise(owid.countries, med.age = mean(med.age, na.rm = TRUE)) #Mean of the median age of all countries.
##    med.age
## 1 30.37978
deaths.med.age <- function(x) { 
                    owid.countries %>%
                    mutate(med.age = cut(med.age, breaks = c(0, 30.38, 48.3), labels = c("below.med", "above.med"))) %>%
                    mutate(med.age <- as.factor(med.age)) %>%
                    filter(continent == x) %>%
                    ggplot(aes(x = location, y = totdeaths.mil, fill = factor(med.age))) +
                        geom_bar(stat = "identity", position = "dodge") +
                        theme_bw() +
                        labs(x = "Country", y = "Total COVID-19 Deaths per Million People") +
                        scale_fill_brewer(palette = "Set1") +
                        theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
                        labs(title = paste0("COVID-19 Deaths per Million People by Country in ", (continent = x), " and Median Age"))
} 
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
  print(deaths.med.age(i))
} #Loops function for continents of interest.

Smoking and COVID-19 Outcomes

Smoking and COVID-19 Cases per Million

Graphs below show percentages of female and male smokers by country and the correlation to COVID-19 cases.

ggplot(data = owid.countries, aes(x = fem.smokers, y = totcases.mil)) +
  geom_point(color = "#333aff", alpha = 1) +
  geom_smooth(color = "red") +
  theme_bw() +
  labs(x = "Percentage of Female Smokers by Country", y = "Total COVID-19 Cases per Million People") +
  labs(title = "World COVID-19 Cases per Million People vs. Percentage of Female Smokers") #COVID-19 cases vs. female smokers on world scale. 

cor(owid.countries$totcases.mil, owid.countries$fem.smokers, use = "complete.obs") 
## [1] 0.2013998
ggplot(data = owid.countries, aes(x = male.smokers, y = totcases.mil)) +
  geom_point(color = "#333aff", alpha = 1) +
  geom_smooth(color = "red") +
  theme_bw() +
  labs(x = "Percentage of Male Smokers by Country", y = "Total COVID-19 Cases per Million People") +
  labs(title = "World COVID-19 Cases per Million People vs. Percentage of Male Smokers") #COVID-19 cases vs. male smokers on world scale. 

cor(owid.countries$totcases.mil, owid.countries$male.smokers, use = "complete.obs") 
## [1] -0.06492578
cases.smoking.f <- function(x) {
                    owid.countries %>%
                    filter(continent == x) %>%
                    ggplot(aes(x = fem.smokers, y = totcases.mil)) +
                      geom_point(color = "#333aff", alpha = 1) +
                      geom_smooth(color = "red") +
                      theme_bw() +
                      labs(x = "Percentage of Female Smokers in Country", y = "Total COVID-19 Cases per Million People") +
                      labs(title = paste0("COVID-19 Cases per Million People in ", (continent = x), " vs. Percentage of Female Smokers"))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
  print(cases.smoking.f(i)) #Apply function to each continent. 
}

cases.smoking.m <- function(x) {
                    owid.countries %>%
                    filter(continent == x) %>%
                    ggplot(aes(x = male.smokers, y = totcases.mil)) +
                      geom_point(color = "#333aff", alpha = 1) +
                      geom_smooth(color = "red") +
                      theme_bw() +
                      labs(x = "Percentage of Male Smokers in Country", y = "Total COVID-19 Cases per Million People") +
                      labs(title = paste0("COVID-19 Cases per Million People in ", (continent = x), " vs. Percentage of Male Smokers"))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
  print(cases.smoking.m(i)) #Apply function to each continent. 
}

Smoking and COVID-19 Deaths per Million

Graphs below show percentages of female and male smokers in the world and by country and the correlation to COVID-19 deaths.

ggplot(data = owid.countries, aes(x = fem.smokers, y = totdeaths.mil)) +
  geom_point(color = "#333aff", alpha = 1) +
  geom_smooth(color = "red") +
  theme_bw() +
  labs(x = "Percentage of Female Smokers by Country", y = "Total COVID-19 Deaths per Million People") +
  labs(title = "World COVID-19 Deaths per Million People vs. Percentage of Female Smokers") #COVID-19 deaths vs. female smokers on world scale. 

cor(owid.countries$totdeaths.mil, owid.countries$fem.smokers, use = "complete.obs") 
## [1] 0.4070555
ggplot(data = owid.countries, aes(x = male.smokers, y = totdeaths.mil)) +
  geom_point(color = "#333aff", alpha = 1) +
  geom_smooth(color = "red") +
  theme_bw() +
  labs(x = "Percentage of Male Smokers by Country", y = "Total COVID-19 Deaths per Million People") +
  labs(title = "World COVID-19 Deaths per Million People vs. Percentage of Male Smokers") #COVID-19 deaths vs. male smokers on world scale. 

cor(owid.countries$totdeaths.mil, owid.countries$male.smokers, use = "complete.obs") 
## [1] -0.09632508
deaths.smoking.f <- function(x) {
                    owid.countries %>%
                    filter(continent == x) %>%
                    ggplot(aes(x = fem.smokers, y = totdeaths.mil)) +
                      geom_point(color = "#333aff", alpha = 1) +
                      geom_smooth(color = "red") +
                      theme_bw() +
                      labs(x = "Percentage of Female Smokers in Country", y = "Total COVID-19 Deaths per Million People") +
                      labs(title = paste0("COVID-19 Deaths per Million People in ", (continent = x), " vs. Percentage of Female Smokers"))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
  print(deaths.smoking.f(i)) #Apply function to each continent. 
}

deaths.smoking.m <- function(x) {
                    owid.countries %>%
                    filter(continent == x) %>%
                    ggplot(aes(x = male.smokers, y = totdeaths.mil)) +
                      geom_point(color = "#333aff", alpha = 1) +
                      geom_smooth(color = "red") +
                      theme_bw() +
                      labs(x = "Percentage of Male Smokers in Country", y = "Total COVID-19 Deaths per Million People") +
                      labs(title = paste0("COVID-19 Deaths per Million People in ", (continent = x), " vs. Percentage of Male Smokers"))
}
for (i in c("North America", "Europe", "Asia", "Africa", "Oceania", "South America")) {
  print(deaths.smoking.m(i)) #Apply function to each continent. 
}

Geospatial Analysis

Downloading World Map File

World map file is read in and prepared for overlaying with COVID-19 data.

library(sf)
#File downloaded from https://hub.arcgis.com/datasets/esri::world-countries-generalized-1/explore and moved to working directory. 
world.map.zip <- "/Users/alanaschreibman/NO2_COVID-19.zip"
if (file.exists("World_Countries_.zip")) file.remove("World_Countries_.zip")
world.sf <- st_read("country_gen_trim.shp")  # Read shapefile as an sf object
## Reading layer `country_gen_trim' from data source `/Users/alanaschreibman/NO2_COVID-19/country_gen_trim.shp' using driver `ESRI Shapefile'
## Simple feature collection with 256 features and 9 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -180 ymin: -89 xmax: 180 ymax: 83.6236
## Geodetic CRS:  WGS 84
class(world.sf)
## [1] "sf"         "data.frame"
head(world.sf)
## Simple feature collection with 6 features and 9 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -176.6431 ymin: -21.94084 xmax: -138.8098 ymax: 0.808609
## Geodetic CRS:  WGS 84
##   OBJECTID FIPS_CNTRY ISO_2DIGIT ISO_3DIGIT             NAME    COUNTRYAFF
## 1        1         AQ         AS        ASM   American Samoa United States
## 2        2         WQ         UM        UMI            Baker United States
## 3        3         CW         CK        COK     Cook Islands   New Zealand
## 4        4         FP         PF        PYF French Polynesia        France
## 5        5         WQ         UM        UMI          Howland United States
## 6        6         WQ         UM        UMI           Jarvis United States
##   CONTINENT SHAPE_Leng   SHAPE_Area                       geometry
## 1   Oceania 0.60012436 1.371972e-02 MULTIPOLYGON (((-170.7439 -...
## 2   Oceania 0.02887532 3.430265e-05 MULTIPOLYGON (((-176.4614 0...
## 3   Oceania 0.98066416 1.307346e-02 MULTIPOLYGON (((-159.747 -2...
## 4   Oceania 3.93021062 1.753321e-01 MULTIPOLYGON (((-149.1792 -...
## 5   Oceania 0.05303007 1.811934e-04 MULTIPOLYGON (((-176.6362 0...
## 6   Oceania 0.09758213 6.658148e-04 MULTIPOLYGON (((-160.0211 -...
world.geo <- st_geometry(world.sf)
plot(world.geo)

plot(world.geo, col = "lemonchiffon2")

plot(world.geo, lwd = 0.5, border = "red")

Geospatial Distribution of COVID-19 Cases

Joins OWID data set with world map data frame and then use country cases to shade countries accordingly.

library(RColorBrewer)
library(viridis)

my_theme <- function() {
  theme_minimal() +                                  
  theme(axis.line = element_blank(),
  plot.title  = element_text(size=30),
  axis.text = element_blank(),                 
  axis.title = element_blank(),
  panel.grid = element_line(color = "white"),  
  legend.key.size = unit(0.8, "cm"),          
  legend.text = element_text(size = 20),      
  legend.title = element_text(size = 20))
}

myPalette <- colorRampPalette(brewer.pal(9, "BuPu")) 

world.sf[105, "NAME"] <- "Democratic Republic of Congo"
world.sf <- world.sf %>%
  rename(location = NAME) %>% 
  inner_join(owid.countries, world.sf, by = "location") #Joined OWID data set with .sph file for geospatial analysis.  
ggplot(data = world.sf, aes(fill = totcases.mil),
    lwd = 0.05) +
  geom_sf() +
  my_theme() +
  labs(title = "Population-Adjusted Distribution of COVID-19 Cases") +
  scale_fill_gradientn(name = "COVID-19 \ncases (per million)",  
                    limits = c(0, 6500), 
                    colours = myPalette(100)) #Plot of world distribution of COVID-19 cases with color shading. 

Geospatial Distribution of COVID-19 Deaths

Uses merged data frame and shades deaths by country accordingly.

ggplot(data = world.sf, aes(fill = totdeaths.mil),
    lwd = 0.05) +
  geom_sf() +
  my_theme() +
  labs(title = "Population-Adjusted Distribution of COVID-19 Deaths") +
  scale_fill_gradientn(name = "COVID-19 \ndeaths (per million)",  
                    limits = c(0, 700), 
                    colours = myPalette(100)) #Plot of world distribution of COVID-19 deaths with color shading. 

Geospatial Distribution of GDP per Capita

Uses merged data frame and shades GDP per capita by country accordingly.

ggplot(data = world.sf, aes(fill = gdp.cap),
    lwd = 0.05) +
  geom_sf() +
  my_theme() +
  labs(title = "Global Distribution of GDP per Capita") +
  scale_fill_gradientn(name = "GDP \nper capita",  
                    limits = c(0, 60000), 
                    colours = myPalette(100)) 

Geospatial Distribution of Population Density

Uses merged data frame and shades population density by country accordingly.

ggplot(data = world.sf, aes(fill = pop.density),
    lwd = 0.05) +
  geom_sf() +
  my_theme() +
  labs(title = "Global Distribution of Population Density") +
  scale_fill_gradientn(name = "Population \ndensity",  
                    limits = c(0, 450), 
                    colours = myPalette(100)) #Plot of world distribution of COVID-19 deaths with color shading. 

Geospatial Distribution of Median Age

Uses merged data frame and shades median age by country accordingly.

ggplot(data = world.sf, aes(fill = med.age),
    lwd = 0.05) +
  geom_sf() +
  my_theme() +
  labs(title = "Global Distribution of Median Age") +
  scale_fill_gradientn(name = "Median \n age",  
                    limits = c(0, 50), 
                    colours = myPalette(100)) 

Geospatial Distribution of Smoking

Uses merged data frame and shades male and female smoking percentages by country accordingly.

ggplot(data = world.sf, aes(fill = fem.smokers),
    lwd = 0.05) +
  geom_sf() +
  my_theme() +
  labs(title = "Percentage of Female Smokers by Country") +
  scale_fill_gradientn(name = "Female \nsmokers (%)",  
                    limits = c(0, 40), 
                    colours = myPalette(100)) 

ggplot(data = world.sf, aes(fill = male.smokers),
    lwd = 0.05) +
  geom_sf() +
  my_theme() +
  labs(title = "Percentage of Male Smokers by Country") +
  scale_fill_gradientn(name = "Male \nsmokers (%)",  
                    limits = c(0, 80), 
                    colours = myPalette(100)) 

Geospatial Distribution of Diabetes

Uses merged data frame and shades diabetes prevalence by country accordingly.

ggplot(data = world.sf, aes(fill = diab.prev),
    lwd = 0.05) +
  geom_sf() +
  my_theme() +
  labs(title = "Diabetes Prevalence by Country") +
  scale_fill_gradientn(name = "Diabetes \nprevalence (%)",  
                    limits = c(0, 20), 
                    colours = myPalette(100)) 

Geospatial Distribution of Cardiovascular Death

Uses merged data frame and shades indicence of cardiovascular death by country accordingly.

ggplot(data = world.sf, aes(fill = card.death),
    lwd = 0.05) +
  geom_sf() +
  my_theme() +
  labs(title = "Incidence of Cardiovascular Death by Country") +
  scale_fill_gradientn(name = "Diabetes \nprevalence (%)",  
                    limits = c(0, 800), 
                    colours = myPalette(100))